{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 65,
   "metadata": {},
   "outputs": [],
   "source": [
    "from pyspark.sql import SQLContext\n",
    "from pyspark import SparkContext\n",
    "sc =SparkContext()\n",
    "sqlContext = SQLContext(sc)\n",
    "\n",
    "data = sqlContext.read.format('com.databricks.spark.csv').options(header='true', inferschema='true').load('train.csv')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 66,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "['Dates',\n",
       " 'Category',\n",
       " 'Descript',\n",
       " 'DayOfWeek',\n",
       " 'PdDistrict',\n",
       " 'Resolution',\n",
       " 'Address',\n",
       " 'X',\n",
       " 'Y']"
      ]
     },
     "execution_count": 66,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "data.columns"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 67,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "drop_list = ['Dates', 'DayOfWeek', 'PdDistrict', 'Resolution', 'Address', 'X', 'Y']\n",
    "\n",
    "data = data.select([column for column in data.columns if column not in drop_list])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 68,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "+--------------+--------------------+\n",
      "|      Category|            Descript|\n",
      "+--------------+--------------------+\n",
      "|      WARRANTS|      WARRANT ARREST|\n",
      "|OTHER OFFENSES|TRAFFIC VIOLATION...|\n",
      "|OTHER OFFENSES|TRAFFIC VIOLATION...|\n",
      "| LARCENY/THEFT|GRAND THEFT FROM ...|\n",
      "| LARCENY/THEFT|GRAND THEFT FROM ...|\n",
      "+--------------+--------------------+\n",
      "only showing top 5 rows\n",
      "\n"
     ]
    }
   ],
   "source": [
    "data.show(5)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 70,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "root\n",
      " |-- Category: string (nullable = true)\n",
      " |-- Descript: string (nullable = true)\n",
      "\n"
     ]
    }
   ],
   "source": [
    "data.printSchema()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 71,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "+--------------------+-----+\n",
      "|            Category|count|\n",
      "+--------------------+-----+\n",
      "|       LARCENY/THEFT| 1933|\n",
      "|        NON-CRIMINAL|  929|\n",
      "|      OTHER OFFENSES|  907|\n",
      "|             ASSAULT|  634|\n",
      "|       VEHICLE THEFT|  460|\n",
      "|           VANDALISM|  346|\n",
      "|            BURGLARY|  292|\n",
      "|            WARRANTS|  284|\n",
      "|      SUSPICIOUS OCC|  242|\n",
      "|      MISSING PERSON|  223|\n",
      "|             ROBBERY|  201|\n",
      "|       DRUG/NARCOTIC|  185|\n",
      "|               FRAUD|  139|\n",
      "|     SECONDARY CODES|   96|\n",
      "|         WEAPON LAWS|   90|\n",
      "|            TRESPASS|   59|\n",
      "|SEX OFFENSES FORC...|   44|\n",
      "|     STOLEN PROPERTY|   37|\n",
      "|         DRUNKENNESS|   29|\n",
      "|FORGERY/COUNTERFE...|   28|\n",
      "+--------------------+-----+\n",
      "only showing top 20 rows\n",
      "\n"
     ]
    }
   ],
   "source": [
    "from pyspark.sql.functions import col\n",
    "\n",
    "# by top 20 categories\n",
    "data.groupBy(\"Category\") \\\n",
    "    .count() \\\n",
    "    .orderBy(col(\"count\").desc()) \\\n",
    "    .show()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 72,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "+--------------------+-----+\n",
      "|            Descript|count|\n",
      "+--------------------+-----+\n",
      "|GRAND THEFT FROM ...|  836|\n",
      "|   STOLEN AUTOMOBILE|  293|\n",
      "|AIDED CASE, MENTA...|  275|\n",
      "|DRIVERS LICENSE, ...|  226|\n",
      "|             BATTERY|  207|\n",
      "|PETTY THEFT FROM ...|  191|\n",
      "|PETTY THEFT OF PR...|  190|\n",
      "|       LOST PROPERTY|  170|\n",
      "|MALICIOUS MISCHIE...|  164|\n",
      "|      WARRANT ARREST|  162|\n",
      "|      FOUND PROPERTY|  143|\n",
      "|MALICIOUS MISCHIE...|  122|\n",
      "|SUSPICIOUS OCCURR...|  121|\n",
      "|GRAND THEFT FROM ...|  111|\n",
      "|INVESTIGATIVE DET...|  105|\n",
      "|PETTY THEFT FROM ...|   97|\n",
      "|        FOUND PERSON|   96|\n",
      "|        STOLEN TRUCK|   89|\n",
      "|PETTY THEFT SHOPL...|   83|\n",
      "|ENROUTE TO OUTSID...|   83|\n",
      "+--------------------+-----+\n",
      "only showing top 20 rows\n",
      "\n"
     ]
    }
   ],
   "source": [
    "# by top 20 descriptions\n",
    "data.groupBy(\"Descript\") \\\n",
    "    .count() \\\n",
    "    .orderBy(col(\"count\").desc()) \\\n",
    "    .show()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 73,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "from pyspark.ml.feature import RegexTokenizer, StopWordsRemover, CountVectorizer\n",
    "from pyspark.ml.classification import LogisticRegression\n",
    "\n",
    "# regular expression tokenizer\n",
    "regexTokenizer = RegexTokenizer(inputCol=\"Descript\", outputCol=\"words\", pattern=\"\\\\W\")\n",
    "\n",
    "# stop words\n",
    "add_stopwords = [\"http\",\"https\",\"amp\",\"rt\",\"t\",\"c\",\"the\"] # standard stop words\n",
    "\n",
    "stopwordsRemover = StopWordsRemover(inputCol=\"words\", outputCol=\"filtered\").setStopWords(add_stopwords)\n",
    "\n",
    "# bag of words count\n",
    "countVectors = CountVectorizer(inputCol=\"filtered\", outputCol=\"features\", vocabSize=10000, minDF=5)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 74,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "from pyspark.ml.feature import OneHotEncoder, StringIndexer, VectorAssembler\n",
    "label_stringIdx = StringIndexer(inputCol = \"Category\", outputCol = \"label\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 75,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "from pyspark.ml import Pipeline\n",
    "\n",
    "pipeline = Pipeline(stages=[regexTokenizer, stopwordsRemover, countVectors, label_stringIdx])\n",
    "\n",
    "# Fit the pipeline to training documents.\n",
    "pipelineFit = pipeline.fit(data)\n",
    "dataset = pipelineFit.transform(data)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 76,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "+--------------+--------------------+--------------------+--------------------+--------------------+-----+\n",
      "|      Category|            Descript|               words|            filtered|            features|label|\n",
      "+--------------+--------------------+--------------------+--------------------+--------------------+-----+\n",
      "|      WARRANTS|      WARRANT ARREST|   [warrant, arrest]|   [warrant, arrest]|(289,[18,35],[1.0...|  7.0|\n",
      "|OTHER OFFENSES|TRAFFIC VIOLATION...|[traffic, violati...|[traffic, violati...|(289,[18,25,47],[...|  2.0|\n",
      "|OTHER OFFENSES|TRAFFIC VIOLATION...|[traffic, violati...|[traffic, violati...|(289,[18,25,47],[...|  2.0|\n",
      "| LARCENY/THEFT|GRAND THEFT FROM ...|[grand, theft, fr...|[grand, theft, fr...|(289,[0,1,2,3,5],...|  0.0|\n",
      "| LARCENY/THEFT|GRAND THEFT FROM ...|[grand, theft, fr...|[grand, theft, fr...|(289,[0,1,2,3,5],...|  0.0|\n",
      "+--------------+--------------------+--------------------+--------------------+--------------------+-----+\n",
      "only showing top 5 rows\n",
      "\n"
     ]
    }
   ],
   "source": [
    "dataset.show(5)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 77,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Training Dataset Count: 5185\n",
      "Test Dataset Count: 2104\n"
     ]
    }
   ],
   "source": [
    "### Randomly split data into training and test sets. set seed for reproducibility\n",
    "(trainingData, testData) = dataset.randomSplit([0.7, 0.3], seed = 100)\n",
    "print(\"Training Dataset Count: \" + str(trainingData.count()))\n",
    "print(\"Test Dataset Count: \" + str(testData.count()))"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Logistic Regression using Count Vector Features"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 78,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "# Build the model\n",
    "lr = LogisticRegression(maxIter=20, regParam=0.3, elasticNetParam=0)\n",
    "\n",
    "# Train model with Training Data\n",
    "lrModel = lr.fit(trainingData)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 79,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "+----------------------------+-------------+------------------------------+-----+----------+\n",
      "|                    Descript|     Category|                   probability|label|prediction|\n",
      "+----------------------------+-------------+------------------------------+-----+----------+\n",
      "|PETTY THEFT FROM LOCKED AUTO|LARCENY/THEFT|[0.8842314993062758,0.01659...|  0.0|       0.0|\n",
      "|PETTY THEFT FROM LOCKED AUTO|LARCENY/THEFT|[0.8842314993062758,0.01659...|  0.0|       0.0|\n",
      "|PETTY THEFT FROM LOCKED AUTO|LARCENY/THEFT|[0.8842314993062758,0.01659...|  0.0|       0.0|\n",
      "|PETTY THEFT FROM LOCKED AUTO|LARCENY/THEFT|[0.8842314993062758,0.01659...|  0.0|       0.0|\n",
      "|PETTY THEFT FROM LOCKED AUTO|LARCENY/THEFT|[0.8842314993062758,0.01659...|  0.0|       0.0|\n",
      "|PETTY THEFT FROM LOCKED AUTO|LARCENY/THEFT|[0.8842314993062758,0.01659...|  0.0|       0.0|\n",
      "|PETTY THEFT FROM LOCKED AUTO|LARCENY/THEFT|[0.8842314993062758,0.01659...|  0.0|       0.0|\n",
      "|PETTY THEFT FROM LOCKED AUTO|LARCENY/THEFT|[0.8842314993062758,0.01659...|  0.0|       0.0|\n",
      "|PETTY THEFT FROM LOCKED AUTO|LARCENY/THEFT|[0.8842314993062758,0.01659...|  0.0|       0.0|\n",
      "|PETTY THEFT FROM LOCKED AUTO|LARCENY/THEFT|[0.8842314993062758,0.01659...|  0.0|       0.0|\n",
      "+----------------------------+-------------+------------------------------+-----+----------+\n",
      "only showing top 10 rows\n",
      "\n"
     ]
    }
   ],
   "source": [
    "predictions = lrModel.transform(testData)\n",
    "\n",
    "predictions.filter(predictions['prediction'] == 0) \\\n",
    "    .select(\"Descript\",\"Category\",\"probability\",\"label\",\"prediction\") \\\n",
    "    .orderBy(\"probability\", ascending=False) \\\n",
    "    .show(n = 10, truncate = 30)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 80,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "0.9610787444388802"
      ]
     },
     "execution_count": 80,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "from pyspark.ml.evaluation import MulticlassClassificationEvaluator\n",
    "evaluator = MulticlassClassificationEvaluator(predictionCol=\"prediction\")\n",
    "evaluator.evaluate(predictions)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Logistic Regression using TF-IDF Features"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 81,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "from pyspark.ml.feature import HashingTF, IDF\n",
    "\n",
    "# Add HashingTF and IDF to transformation\n",
    "hashingTF = HashingTF(inputCol=\"filtered\", outputCol=\"rawFeatures\", numFeatures=10000)\n",
    "idf = IDF(inputCol=\"rawFeatures\", outputCol=\"features\", minDocFreq=5) #minDocFreq: remove sparse terms\n",
    "\n",
    "# Redo Pipeline\n",
    "pipeline = Pipeline(stages=[regexTokenizer, stopwordsRemover, hashingTF, idf, label_stringIdx])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 82,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "pipelineFit = pipeline.fit(data)\n",
    "dataset = pipelineFit.transform(data)\n",
    "\n",
    "### Randomly split data into training and test sets. set seed for reproducibility\n",
    "(trainingData, testData) = dataset.randomSplit([0.7, 0.3], seed = 100)\n",
    "\n",
    "# Build the model\n",
    "lr = LogisticRegression(maxIter=20, regParam=0.3, elasticNetParam=0)\n",
    "\n",
    "# Train model with Training Data\n",
    "lrModel = lr.fit(trainingData)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 83,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "+----------------------------+-------------+------------------------------+-----+----------+\n",
      "|                    Descript|     Category|                   probability|label|prediction|\n",
      "+----------------------------+-------------+------------------------------+-----+----------+\n",
      "|PETTY THEFT FROM LOCKED AUTO|LARCENY/THEFT|[0.8842990879226498,0.01659...|  0.0|       0.0|\n",
      "|PETTY THEFT FROM LOCKED AUTO|LARCENY/THEFT|[0.8842990879226498,0.01659...|  0.0|       0.0|\n",
      "|PETTY THEFT FROM LOCKED AUTO|LARCENY/THEFT|[0.8842990879226498,0.01659...|  0.0|       0.0|\n",
      "|PETTY THEFT FROM LOCKED AUTO|LARCENY/THEFT|[0.8842990879226498,0.01659...|  0.0|       0.0|\n",
      "|PETTY THEFT FROM LOCKED AUTO|LARCENY/THEFT|[0.8842990879226498,0.01659...|  0.0|       0.0|\n",
      "|PETTY THEFT FROM LOCKED AUTO|LARCENY/THEFT|[0.8842990879226498,0.01659...|  0.0|       0.0|\n",
      "|PETTY THEFT FROM LOCKED AUTO|LARCENY/THEFT|[0.8842990879226498,0.01659...|  0.0|       0.0|\n",
      "|PETTY THEFT FROM LOCKED AUTO|LARCENY/THEFT|[0.8842990879226498,0.01659...|  0.0|       0.0|\n",
      "|PETTY THEFT FROM LOCKED AUTO|LARCENY/THEFT|[0.8842990879226498,0.01659...|  0.0|       0.0|\n",
      "|PETTY THEFT FROM LOCKED AUTO|LARCENY/THEFT|[0.8842990879226498,0.01659...|  0.0|       0.0|\n",
      "+----------------------------+-------------+------------------------------+-----+----------+\n",
      "only showing top 10 rows\n",
      "\n"
     ]
    }
   ],
   "source": [
    "predictions = lrModel.transform(testData)\n",
    "\n",
    "predictions.filter(predictions['prediction'] == 0) \\\n",
    "    .select(\"Descript\",\"Category\",\"probability\",\"label\",\"prediction\") \\\n",
    "    .orderBy(\"probability\", ascending=False) \\\n",
    "    .show(n = 10, truncate = 30)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 84,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "0.9616202660247297"
      ]
     },
     "execution_count": 84,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "evaluator = MulticlassClassificationEvaluator(predictionCol=\"prediction\")\n",
    "evaluator.evaluate(predictions)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Cross Validation"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 85,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "0.9851796929217101"
      ]
     },
     "execution_count": 85,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "pipeline = Pipeline(stages=[regexTokenizer, stopwordsRemover, countVectors, label_stringIdx])\n",
    "\n",
    "pipelineFit = pipeline.fit(data)\n",
    "dataset = pipelineFit.transform(data)\n",
    "(trainingData, testData) = dataset.randomSplit([0.7, 0.3], seed = 100)\n",
    "\n",
    "# Build the model\n",
    "lr = LogisticRegression(maxIter=20, regParam=0.3, elasticNetParam=0)\n",
    "\n",
    "from pyspark.ml.tuning import ParamGridBuilder, CrossValidator\n",
    "\n",
    "# Create ParamGrid for Cross Validation\n",
    "paramGrid = (ParamGridBuilder()\n",
    "             .addGrid(lr.regParam, [0.1, 0.3, 0.5]) # regularization parameter\n",
    "             .addGrid(lr.elasticNetParam, [0.0, 0.1, 0.2]) # Elastic Net Parameter (Ridge = 0)\n",
    "#            .addGrid(model.maxIter, [10, 20, 50]) #Number of iterations\n",
    "#            .addGrid(idf.numFeatures, [10, 100, 1000]) # Number of features\n",
    "             .build())\n",
    "\n",
    "# Create 5-fold CrossValidator\n",
    "cv = CrossValidator(estimator=lr, \\\n",
    "                    estimatorParamMaps=paramGrid, \\\n",
    "                    evaluator=evaluator, \\\n",
    "                    numFolds=5)\n",
    "\n",
    "# Run cross validations\n",
    "cvModel = cv.fit(trainingData)\n",
    "# this will likely take a fair amount of time because of the amount of models that we're creating and testing\n",
    "\n",
    "# Use test set here so we can measure the accuracy of our model on new data\n",
    "predictions = cvModel.transform(testData)\n",
    "\n",
    "# cvModel uses the best model found from the Cross Validation\n",
    "# Evaluate best model\n",
    "evaluator = MulticlassClassificationEvaluator(predictionCol=\"prediction\")\n",
    "evaluator.evaluate(predictions)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Naive Bayes"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 86,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "from pyspark.ml.classification import NaiveBayes\n",
    "\n",
    "# create the trainer and set its parameters\n",
    "nb = NaiveBayes(smoothing=1)\n",
    "\n",
    "# train the model\n",
    "model = nb.fit(trainingData)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 87,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "+----------------------------+-------------+------------------------------+-----+----------+\n",
      "|                    Descript|     Category|                   probability|label|prediction|\n",
      "+----------------------------+-------------+------------------------------+-----+----------+\n",
      "|GRAND THEFT FROM LOCKED AUTO|LARCENY/THEFT|[0.9999999985178225,1.05533...|  0.0|       0.0|\n",
      "|GRAND THEFT FROM LOCKED AUTO|LARCENY/THEFT|[0.9999999985178225,1.05533...|  0.0|       0.0|\n",
      "|GRAND THEFT FROM LOCKED AUTO|LARCENY/THEFT|[0.9999999985178225,1.05533...|  0.0|       0.0|\n",
      "|GRAND THEFT FROM LOCKED AUTO|LARCENY/THEFT|[0.9999999985178225,1.05533...|  0.0|       0.0|\n",
      "|GRAND THEFT FROM LOCKED AUTO|LARCENY/THEFT|[0.9999999985178225,1.05533...|  0.0|       0.0|\n",
      "|GRAND THEFT FROM LOCKED AUTO|LARCENY/THEFT|[0.9999999985178225,1.05533...|  0.0|       0.0|\n",
      "|GRAND THEFT FROM LOCKED AUTO|LARCENY/THEFT|[0.9999999985178225,1.05533...|  0.0|       0.0|\n",
      "|GRAND THEFT FROM LOCKED AUTO|LARCENY/THEFT|[0.9999999985178225,1.05533...|  0.0|       0.0|\n",
      "|GRAND THEFT FROM LOCKED AUTO|LARCENY/THEFT|[0.9999999985178225,1.05533...|  0.0|       0.0|\n",
      "|GRAND THEFT FROM LOCKED AUTO|LARCENY/THEFT|[0.9999999985178225,1.05533...|  0.0|       0.0|\n",
      "+----------------------------+-------------+------------------------------+-----+----------+\n",
      "only showing top 10 rows\n",
      "\n"
     ]
    }
   ],
   "source": [
    "predictions = model.transform(testData)\n",
    "predictions.filter(predictions['prediction'] == 0) \\\n",
    "    .select(\"Descript\",\"Category\",\"probability\",\"label\",\"prediction\") \\\n",
    "    .orderBy(\"probability\", ascending=False) \\\n",
    "    .show(n = 10, truncate = 30)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 88,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "0.9625414629888848"
      ]
     },
     "execution_count": 88,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "evaluator = MulticlassClassificationEvaluator(predictionCol=\"prediction\")\n",
    "evaluator.evaluate(predictions)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Random Forest"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 89,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "from pyspark.ml.classification import RandomForestClassifier\n",
    "\n",
    "# Create an initial RandomForest model.\n",
    "rf = RandomForestClassifier(labelCol=\"label\", \\\n",
    "                            featuresCol=\"features\", \\\n",
    "                            numTrees = 100, \\\n",
    "                            maxDepth = 4, \\\n",
    "                            maxBins = 32)\n",
    "\n",
    "# Train model with Training Data\n",
    "rfModel = rf.fit(trainingData)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 90,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "+----------------------------+-------------+------------------------------+-----+----------+\n",
      "|                    Descript|     Category|                   probability|label|prediction|\n",
      "+----------------------------+-------------+------------------------------+-----+----------+\n",
      "|PETTY THEFT FROM LOCKED AUTO|LARCENY/THEFT|[0.7544297305704795,0.04025...|  0.0|       0.0|\n",
      "|PETTY THEFT FROM LOCKED AUTO|LARCENY/THEFT|[0.7544297305704795,0.04025...|  0.0|       0.0|\n",
      "|PETTY THEFT FROM LOCKED AUTO|LARCENY/THEFT|[0.7544297305704795,0.04025...|  0.0|       0.0|\n",
      "|PETTY THEFT FROM LOCKED AUTO|LARCENY/THEFT|[0.7544297305704795,0.04025...|  0.0|       0.0|\n",
      "|PETTY THEFT FROM LOCKED AUTO|LARCENY/THEFT|[0.7544297305704795,0.04025...|  0.0|       0.0|\n",
      "|PETTY THEFT FROM LOCKED AUTO|LARCENY/THEFT|[0.7544297305704795,0.04025...|  0.0|       0.0|\n",
      "|PETTY THEFT FROM LOCKED AUTO|LARCENY/THEFT|[0.7544297305704795,0.04025...|  0.0|       0.0|\n",
      "|PETTY THEFT FROM LOCKED AUTO|LARCENY/THEFT|[0.7544297305704795,0.04025...|  0.0|       0.0|\n",
      "|PETTY THEFT FROM LOCKED AUTO|LARCENY/THEFT|[0.7544297305704795,0.04025...|  0.0|       0.0|\n",
      "|PETTY THEFT FROM LOCKED AUTO|LARCENY/THEFT|[0.7544297305704795,0.04025...|  0.0|       0.0|\n",
      "+----------------------------+-------------+------------------------------+-----+----------+\n",
      "only showing top 10 rows\n",
      "\n"
     ]
    }
   ],
   "source": [
    "predictions = rfModel.transform(testData)\n",
    "\n",
    "predictions.filter(predictions['prediction'] == 0) \\\n",
    "    .select(\"Descript\",\"Category\",\"probability\",\"label\",\"prediction\") \\\n",
    "    .orderBy(\"probability\", ascending=False) \\\n",
    "    .show(n = 10, truncate = 30)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 91,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "0.6600326922344301"
      ]
     },
     "execution_count": 91,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "evaluator = MulticlassClassificationEvaluator(predictionCol=\"prediction\")\n",
    "evaluator.evaluate(predictions)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Cross Validation"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 93,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Create ParamGrid for Cross Validation\n",
    "paramGrid = (ParamGridBuilder()\n",
    "             .addGrid(rf.numTrees, [50, 100, 200]) # number of trees\n",
    "             .addGrid(rf.maxDepth, [3, 4, 5]) # maximum depth\n",
    "#            .addGrid(rf.maxBins, [24, 32, 40]) #Number of bins\n",
    "             .build())\n",
    "\n",
    "# Create 5-fold CrossValidator\n",
    "cv = CrossValidator(estimator=rf, \\\n",
    "                    estimatorParamMaps=paramGrid, \\\n",
    "                    evaluator=evaluator, \\\n",
    "                    numFolds=5)\n",
    "\n",
    "# Run cross validations\n",
    "cvModel = cv.fit(trainingData)\n",
    "\n",
    "# Use test set here so we can measure the accuracy of our model on new data\n",
    "predictions = cvModel.transform(testData)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 94,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "0.7312371927336645"
      ]
     },
     "execution_count": 94,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "evaluator = MulticlassClassificationEvaluator(predictionCol=\"prediction\")\n",
    "evaluator.evaluate(predictions)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "conda_python3",
   "language": "python",
   "name": "conda_python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.6.2"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
